QOL Healthcare Utilization (VSURF and Analysis of Deviance)

Author

Miguel Fudolig

Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)

Setup

Code
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"),
         Religion=relevel(Religion,ref="None")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |> 
  mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
         across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
         across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
         `Primary Language` = as.factor(`Primary Language`))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
Code
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Unmet Health Need

VSURF Approach

Important

We will be using the interpretation set to run an analysis of deviance to check model performance.

Code
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Health.Need)
Characteristic N = 1,9741
Unmet.Health.Need
    0 1,770 (90%)
    Yes 204 (10%)
1 n (%)
Code
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Unmet.Health.Need,
                                   p=training_prop,
                                   list=F)

training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]

training_set |> gtsummary::tbl_summary(include=Unmet.Health.Need)
Characteristic N = 1,5801
Unmet.Health.Need
    0 1,416 (90%)
    Yes 164 (10%)
1 n (%)
Code
test_set |> gtsummary::tbl_summary(include=Unmet.Health.Need)
Characteristic N = 3941
Unmet.Health.Need
    0 354 (90%)
    Yes 40 (10%)
1 n (%)
Code
n_minority <- training_set |> filter(Unmet.Health.Need=="Yes")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = training_set[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Unmet.Health.Need~.,
      training_set,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Unmet.Health.Need ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 6.7 secs 

 VSURF selected: 
    18 variables at thresholding step (in 3.8 secs)
    1 variables at interpretation step (in 2.5 secs)
    1 variables at prediction step (in 0.5 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.106962

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.01080       0.00041 black
2                English.Speaking    0.01053       0.00050 black
3                             Age    0.00873       0.00065 black
4                  Discrimination    0.00850       0.00045 black
5                       Ethnicity    0.00641       0.00051   red
6            English.Difficulties    0.00615       0.00045 black
7                Dental.Insurance    0.00527       0.00055 black
8                        Religion    0.00385       0.00041 black
9                       Belonging    0.00261       0.00037 black
10       Familiarity.with.America    0.00194       0.00050 black
11                        US.Born    0.00176       0.00023 black
12                  Income_median    0.00155       0.00037 black
13           Full.Time.Employment    0.00143       0.00024 black
14            Identify.Ethnically    0.00133       0.00036 black
15               Health.Insurance    0.00124       0.00038 black
16               Primary.Language    0.00101       0.00030 black
17                         Gender    0.00070       0.00025 black
18 Familiarity.with.Ethnic.Origin    0.00066       0.00037 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")

Analysis of Deviance Using the Interpretation Set

Code
all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.interp]


pred_vars <- if("Ethnicity" %in% pred_vars){
  pred_vars
} else {
  c(pred_vars,"Ethnicity")
  }
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")

options(contrasts = c("contr.sum","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=training_set)
mod2 <- glm(mod_form_noEth, family="binomial", data=training_set)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Health.Need ~ Duration.of.Residency
Model 2: Unmet.Health.Need ~ Duration.of.Residency + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1578     1052.5                          
2      1573     1006.8  5   45.709 1.041e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1058.393        1067.276
Code
car::Anova(mod1, test.statistic = "F")
Analysis of Deviance Table (Type II tests)

Response: Unmet.Health.Need
Error estimate based on Pearson residuals 

                       Sum Sq   Df F value    Pr(>F)    
Duration.of.Residency    5.20    1  5.0129    0.0253 *  
Ethnicity               45.71    5  8.8196 2.882e-08 ***
Residuals             1630.47 1573                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = training_set)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.131584   0.155408 -13.716  < 2e-16 ***
Duration.of.Residency -0.016066   0.007193  -2.234  0.02551 *  
Ethnicity1             0.185655   0.181354   1.024  0.30597    
Ethnicity2            -0.791520   0.248527  -3.185  0.00145 ** 
Ethnicity3            -0.376729   0.303650  -1.241  0.21473    
Ethnicity4             0.545289   0.177129   3.078  0.00208 ** 
Ethnicity5            -0.465909   0.393166  -1.185  0.23601    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1053.4  on 1579  degrees of freedom
Residual deviance: 1006.8  on 1573  degrees of freedom
AIC: 1020.8

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.0996 0.0154 Inf    0.0732     0.134
 Asian Indian 0.0400 0.0104 Inf    0.0240     0.066
 Filipino     0.0593 0.0192 Inf    0.0311     0.110
 Korean       0.1369 0.0194 Inf    0.1030     0.180
 Other        0.0545 0.0237 Inf    0.0229     0.124
 Vietnamese   0.1849 0.0228 Inf    0.1443     0.234

Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")
 contrast            estimate    SE  df z.ratio p.value
 Chinese effect         0.186 0.181 Inf   1.024  0.3060
 Asian Indian effect   -0.792 0.249 Inf  -3.185  0.0042
 Filipino effect       -0.377 0.304 Inf  -1.241  0.2832
 Korean effect          0.545 0.177 Inf   3.078  0.0042
 Other effect          -0.466 0.393 Inf  -1.185  0.2832
 Vietnamese effect      0.903 0.170 Inf   5.301  <.0001

Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 6 tests 

Test Set Prediction with Ethnicity

Code
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Health.Need,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Unmet.Health.Need, predictor = predictions)

Data: predictions in 354 controls (test_set$Unmet.Health.Need 0) < 40 cases (test_set$Unmet.Health.Need Yes).
Area under the curve: 0.586
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.1228186
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Health.Need,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   226 128
       Yes  18  22
                                          
               Accuracy : 0.6294          
                 95% CI : (0.5797, 0.6773)
    No Information Rate : 0.6193          
    P-Value [Acc > NIR] : 0.3596          
                                          
                  Kappa : 0.0849          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.14667         
            Specificity : 0.92623         
         Pos Pred Value : 0.55000         
         Neg Pred Value : 0.63842         
             Prevalence : 0.38071         
         Detection Rate : 0.05584         
   Detection Prevalence : 0.10152         
      Balanced Accuracy : 0.53645         
                                          
       'Positive' Class : Yes             
                                          

Test Set Prediction without Ethnicity

Code
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Health.Need,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Unmet.Health.Need, predictor = predictions)

Data: predictions in 354 controls (test_set$Unmet.Health.Need 0) < 40 cases (test_set$Unmet.Health.Need Yes).
Area under the curve: 0.5696
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.09568894
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Health.Need,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    56 298
       Yes   1  39
                                          
               Accuracy : 0.2411          
                 95% CI : (0.1997, 0.2865)
    No Information Rate : 0.8553          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.031           
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.11573         
            Specificity : 0.98246         
         Pos Pred Value : 0.97500         
         Neg Pred Value : 0.15819         
             Prevalence : 0.85533         
         Detection Rate : 0.09898         
   Detection Prevalence : 0.10152         
      Balanced Accuracy : 0.54909         
                                          
       'Positive' Class : Yes             
                                          

LOOCV approach

Code
unregister_dopar <- function() {
  env <- foreach:::.foreachGlobals
  rm(list=ls(name=env), pos=env)
}
unregister_dopar()

With ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Unmet.Health.Need~., data = rfdata, method = "rf", trControl = ctrl)
summary(model)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted       1974   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes           3948   matrix     numeric  
oob.times       1974   -none-     numeric  
classes            2   -none-     character
importance        39   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y               1974   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            39   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
Code
model |> print()
Random Forest 

1974 samples
  18 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1776, 1776, 1777, 1776, 1777, 1777, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa     
   2    0.8966621  0.00000000
  20    0.8981823  0.11232402
  39    0.8936189  0.08244707

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 20.

No ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Unmet.Health.Need~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()
Random Forest 

1974 samples
  17 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1776, 1777, 1777, 1777, 1776, 1777, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa     
   2    0.8966621  0.00000000
  18    0.8961596  0.06704321
  34    0.8941291  0.07586865

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

Unmet Dental Need

VSURF Approach

Code
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)
Characteristic N = 1,9681
Unmet.Dental.Needs
    0 1,744 (89%)
    Yes 224 (11%)
1 n (%)
Code
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Unmet.Dental.Needs,
                                   p=training_prop,
                                   list=F)

training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]

training_set |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)
Characteristic N = 1,5761
Unmet.Dental.Needs
    0 1,396 (89%)
    Yes 180 (11%)
1 n (%)
Code
test_set |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)
Characteristic N = 3921
Unmet.Dental.Needs
    0 348 (89%)
    Yes 44 (11%)
1 n (%)
Code
n_minority <- training_set |> filter(Unmet.Dental.Needs=="Yes")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = training_set[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Unmet.Dental.Needs~.,
      training_set,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 6.6 secs 

 VSURF selected: 
    17 variables at thresholding step (in 4 secs)
    1 variables at interpretation step (in 2.4 secs)
    1 variables at prediction step (in 0.2 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1159898

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.01434       0.00073 black
2                English.Speaking    0.01012       0.00058 black
3           Duration.of.Residency    0.00987       0.00064 black
4                             Age    0.00910       0.00067 black
5                        Religion    0.00570       0.00057 black
6        Familiarity.with.America    0.00398       0.00038 black
7                Primary.Language    0.00304       0.00032 black
8                       Ethnicity    0.00271       0.00043   red
9                   Income_median    0.00227       0.00043 black
10           English.Difficulties    0.00217       0.00049 black
11 Familiarity.with.Ethnic.Origin    0.00172       0.00043 black
12                        US.Born    0.00157       0.00018 black
13                 Discrimination    0.00156       0.00043 black
14                      Belonging    0.00135       0.00042 black
15            Identify.Ethnically    0.00129       0.00036 black
16           Full.Time.Employment    0.00104       0.00035 black
17               Health.Insurance    0.00095       0.00044 black
18                         Gender   -0.00015       0.00018 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")
Note

Dental Insurance is the only variable in the interpretation set, which means Ethnicity might not be a significant predictor of unmet dental needs.

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1966     1300.0                     
2      1961     1291.3  5   8.7216   0.1207
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1       1344.41        1315.208
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -2.11153    0.09115 -23.165   <2e-16 ***
Dental.Insurance1  0.69784    0.07851   8.889   <2e-16 ***
Ethnicity1        -0.12599    0.15545  -0.810   0.4177    
Ethnicity2        -0.31150    0.17526  -1.777   0.0755 .  
Ethnicity3         0.17410    0.21838   0.797   0.4253    
Ethnicity4         0.23328    0.14812   1.575   0.1153    
Ethnicity5        -0.17960    0.30374  -0.591   0.5543    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1395.0  on 1967  degrees of freedom
Residual deviance: 1291.3  on 1961  degrees of freedom
AIC: 1305.3

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.0964 0.0138 Inf    0.0726     0.127
 Asian Indian 0.0814 0.0139 Inf    0.0580     0.113
 Filipino     0.1259 0.0266 Inf    0.0823     0.188
 Korean       0.1326 0.0170 Inf    0.1026     0.170
 Other        0.0919 0.0297 Inf    0.0480     0.169
 Vietnamese   0.1299 0.0176 Inf    0.0991     0.168

Results are averaged over the levels of: Dental.Insurance 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")
 contrast            estimate    SE  df z.ratio p.value
 Chinese effect        -0.126 0.155 Inf  -0.810  0.5104
 Asian Indian effect   -0.311 0.175 Inf  -1.777  0.3458
 Filipino effect        0.174 0.218 Inf   0.797  0.5104
 Korean effect          0.233 0.148 Inf   1.575  0.3458
 Other effect          -0.180 0.304 Inf  -0.591  0.5543
 Vietnamese effect      0.210 0.154 Inf   1.361  0.3469

Results are averaged over the levels of: Dental.Insurance 
Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 6 tests 
Note

The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.

Test Set Prediction with Ethnicity

Code
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Dental.Needs,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Unmet.Dental.Needs, predictor = predictions)

Data: predictions in 348 controls (test_set$Unmet.Dental.Needs 0) < 44 cases (test_set$Unmet.Dental.Needs Yes).
Area under the curve: 0.6702
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.1109492
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Dental.Needs,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   226 122
       Yes  15  29
                                         
               Accuracy : 0.6505         
                 95% CI : (0.601, 0.6977)
    No Information Rate : 0.6148         
    P-Value [Acc > NIR] : 0.07996        
                                         
                  Kappa : 0.1496         
                                         
 Mcnemar's Test P-Value : < 2e-16        
                                         
            Sensitivity : 0.19205        
            Specificity : 0.93776        
         Pos Pred Value : 0.65909        
         Neg Pred Value : 0.64943        
             Prevalence : 0.38520        
         Detection Rate : 0.07398        
   Detection Prevalence : 0.11224        
      Balanced Accuracy : 0.56491        
                                         
       'Positive' Class : Yes            
                                         

Test Set Prediction without Ethnicity

Code
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Dental.Needs,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Unmet.Dental.Needs, predictor = predictions)

Data: predictions in 348 controls (test_set$Unmet.Dental.Needs 0) < 44 cases (test_set$Unmet.Dental.Needs Yes).
Area under the curve: 0.6543
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.1291447
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Dental.Needs,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   226 122
       Yes  15  29
                                         
               Accuracy : 0.6505         
                 95% CI : (0.601, 0.6977)
    No Information Rate : 0.6148         
    P-Value [Acc > NIR] : 0.07996        
                                         
                  Kappa : 0.1496         
                                         
 Mcnemar's Test P-Value : < 2e-16        
                                         
            Sensitivity : 0.19205        
            Specificity : 0.93776        
         Pos Pred Value : 0.65909        
         Neg Pred Value : 0.64943        
             Prevalence : 0.38520        
         Detection Rate : 0.07398        
   Detection Prevalence : 0.11224        
      Balanced Accuracy : 0.56491        
                                         
       'Positive' Class : Yes            
                                         

LOOCV approach

With ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Unmet.Dental.Needs~., data = rfdata, method = "rf", trControl = ctrl)
summary(model)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted       1968   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes           3936   matrix     numeric  
oob.times       1968   -none-     numeric  
classes            2   -none-     character
importance        39   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y               1968   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            39   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
Code
model |> print()
Random Forest 

1968 samples
  18 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1771, 1772, 1772, 1770, 1771, 1772, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa     
   2    0.8861825  0.00000000
  20    0.8806012  0.02161838
  39    0.8795808  0.03184641

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

No ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Unmet.Dental.Needs~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()
Random Forest 

1968 samples
  17 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1772, 1772, 1770, 1771, 1772, 1771, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa     
   2    0.8861844  0.00000000
  18    0.8815952  0.04198948
  34    0.8790571  0.04239551

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

Physical Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)
Characteristic N = 1,9661
Physical.Check.up
    0 630 (32%)
    Yes 1,336 (68%)
1 n (%)
Code
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Physical.Check.up,
                                   p=training_prop,
                                   list=F)

training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]

training_set |> gtsummary::tbl_summary(include=Physical.Check.up)
Characteristic N = 1,5731
Physical.Check.up
    0 504 (32%)
    Yes 1,069 (68%)
1 n (%)
Code
test_set |> gtsummary::tbl_summary(include=Physical.Check.up)
Characteristic N = 3931
Physical.Check.up
    0 126 (32%)
    Yes 267 (68%)
1 n (%)
Code
n_minority <- training_set |> filter(Physical.Check.up=="0")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = training_set[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Physical.Check.up~.,
      training_set,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Physical.Check.up ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 8.6 secs 

 VSURF selected: 
    16 variables at thresholding step (in 4.7 secs)
    3 variables at interpretation step (in 2.7 secs)
    2 variables at prediction step (in 1.1 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "Health.Insurance"     
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "Age"                   "Health.Insurance"     
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.2933566
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.03041       0.00086 black
2                             Age    0.02406       0.00115 black
3                Health.Insurance    0.01988       0.00062 black
4                Dental.Insurance    0.01257       0.00075 black
5                       Ethnicity    0.00767       0.00084   red
6                   Income_median    0.00571       0.00044 black
7                        Religion    0.00514       0.00052 black
8                     EnglishDiff    0.00469       0.00072 black
9                    EnglishSpeak    0.00397       0.00050 black
10 Familiarity.with.Ethnic.Origin    0.00373       0.00056 black
11                     Employment    0.00313       0.00031 black
12       Familiarity.with.America    0.00202       0.00048 black
13                         Gender    0.00129       0.00047 black
14                 Discrimination    0.00087       0.00035 black
15               Primary.Language    0.00081       0.00030 black
16                      Belonging    0.00067       0.00050 black
17                        US.Born    0.00021       0.00027 black
18            Identify.Ethnically   -0.00028       0.00044 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance
Model 2: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance + 
    Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1962     2198.1                          
2      1957     2155.8  5    42.35 5.004e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      2224.021        2228.452
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.974225   0.165135  -5.900 3.64e-09 ***
Duration.of.Residency  0.039534   0.005388   7.337 2.18e-13 ***
Age                    0.015793   0.003846   4.107 4.01e-05 ***
Health.Insurance1     -0.791580   0.074487 -10.627  < 2e-16 ***
Ethnicity1            -0.055909   0.105985  -0.528   0.5978    
Ethnicity2             0.185691   0.112516   1.650   0.0989 .  
Ethnicity3             0.462318   0.161909   2.855   0.0043 ** 
Ethnicity4            -0.583407   0.110034  -5.302 1.14e-07 ***
Ethnicity5            -0.279414   0.189871  -1.472   0.1411    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2466.2  on 1965  degrees of freedom
Residual deviance: 2155.8  on 1957  degrees of freedom
AIC: 2173.8

Number of Fisher Scoring iterations: 4
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity     prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.563 0.0292 Inf     0.505     0.619
 Asian Indian 0.621 0.0306 Inf     0.559     0.679
 Filipino     0.683 0.0413 Inf     0.598     0.758
 Korean       0.431 0.0303 Inf     0.373     0.491
 Other        0.507 0.0564 Inf     0.398     0.615
 Vietnamese   0.641 0.0319 Inf     0.576     0.700

Results are averaged over the levels of: Health.Insurance 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")
 contrast            estimate    SE  df z.ratio p.value
 Chinese effect       -0.0559 0.106 Inf  -0.528  0.5978
 Asian Indian effect   0.1857 0.113 Inf   1.650  0.1483
 Filipino effect       0.4623 0.162 Inf   2.855  0.0129
 Korean effect        -0.5834 0.110 Inf  -5.302  <.0001
 Other effect         -0.2794 0.190 Inf  -1.472  0.1694
 Vietnamese effect     0.2707 0.124 Inf   2.184  0.0579

Results are averaged over the levels of: Health.Insurance 
Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 6 tests 

Test Set Prediction with Ethnicity

Code
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Physical.Check.up,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Physical.Check.up, predictor = predictions)

Data: predictions in 126 controls (test_set$Physical.Check.up 0) < 267 cases (test_set$Physical.Check.up Yes).
Area under the curve: 0.7751
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.6782272
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Physical.Check.up,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    94  32
       Yes  81 186
                                         
               Accuracy : 0.7125         
                 95% CI : (0.665, 0.7567)
    No Information Rate : 0.5547         
    P-Value [Acc > NIR] : 9.406e-11      
                                         
                  Kappa : 0.4014         
                                         
 Mcnemar's Test P-Value : 6.318e-06      
                                         
            Sensitivity : 0.8532         
            Specificity : 0.5371         
         Pos Pred Value : 0.6966         
         Neg Pred Value : 0.7460         
             Prevalence : 0.5547         
         Detection Rate : 0.4733         
   Detection Prevalence : 0.6794         
      Balanced Accuracy : 0.6952         
                                         
       'Positive' Class : Yes            
                                         

Test Set Prediction without Ethnicity

Code
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Physical.Check.up,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Physical.Check.up, predictor = predictions)

Data: predictions in 126 controls (test_set$Physical.Check.up 0) < 267 cases (test_set$Physical.Check.up Yes).
Area under the curve: 0.7543
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.677257
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Physical.Check.up,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    94  32
       Yes  83 184
                                          
               Accuracy : 0.7074          
                 95% CI : (0.6597, 0.7519)
    No Information Rate : 0.5496          
    P-Value [Acc > NIR] : 1.033e-10       
                                          
                  Kappa : 0.3932          
                                          
 Mcnemar's Test P-Value : 3.124e-06       
                                          
            Sensitivity : 0.8519          
            Specificity : 0.5311          
         Pos Pred Value : 0.6891          
         Neg Pred Value : 0.7460          
             Prevalence : 0.5496          
         Detection Rate : 0.4682          
   Detection Prevalence : 0.6794          
      Balanced Accuracy : 0.6915          
                                          
       'Positive' Class : Yes             
                                          

LOOCV approach

With ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Physical.Check.up~., data = rfdata, method = "rf", trControl = ctrl)
summary(model)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted       1966   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes           3932   matrix     numeric  
oob.times       1966   -none-     numeric  
classes            2   -none-     character
importance        39   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y               1966   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            39   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
Code
model |> print()
Random Forest 

1966 samples
  18 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1769, 1770, 1769, 1769, 1770, 1769, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7279188  0.2632947
  20    0.7202424  0.3081309
  39    0.7085543  0.2792593

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

No ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Physical.Check.up~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()
Random Forest 

1966 samples
  17 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1770, 1769, 1770, 1769, 1769, 1770, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7085621  0.2076427
  18    0.7141485  0.2952116
  34    0.7090671  0.2868507

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 18.

Dental Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)
Characteristic N = 1,9611
Dentist.Check.up
    0 811 (41%)
    Yes 1,150 (59%)
1 n (%)
Code
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Dentist.Check.up,
                                   p=training_prop,
                                   list=F)

training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]

training_set |>  gtsummary::tbl_summary(include=Dentist.Check.up)
Characteristic N = 1,5691
Dentist.Check.up
    0 649 (41%)
    Yes 920 (59%)
1 n (%)
Code
test_set |>  gtsummary::tbl_summary(include=Dentist.Check.up)
Characteristic N = 3921
Dentist.Check.up
    0 162 (41%)
    Yes 230 (59%)
1 n (%)
Code
n_minority <- training_set |> filter(Dentist.Check.up=="0") |> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = training_set[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Dentist.Check.up~.,
      training_set,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Dentist.Check.up ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 8.3 secs 

 VSURF selected: 
    18 variables at thresholding step (in 4.6 secs)
    3 variables at interpretation step (in 3 secs)
    1 variables at prediction step (in 0.8 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"      "Duration.of.Residency" "Religion"             
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.2981517
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.05190       0.00096 black
2           Duration.of.Residency    0.04496       0.00083 black
3                        Religion    0.01423       0.00078 black
4                       Ethnicity    0.01252       0.00063   red
5                             Age    0.00839       0.00080 black
6                    EnglishSpeak    0.00601       0.00076 black
7                   Income_median    0.00590       0.00051 black
8                Health.Insurance    0.00571       0.00055 black
9             Identify.Ethnically    0.00390       0.00045 black
10       Familiarity.with.America    0.00329       0.00044 black
11                         Gender    0.00302       0.00043 black
12 Familiarity.with.Ethnic.Origin    0.00271       0.00050 black
13                     Employment    0.00208       0.00039 black
14                    EnglishDiff    0.00185       0.00024 black
15               Primary.Language    0.00160       0.00033 black
16                      Belonging    0.00158       0.00045 black
17                        US.Born    0.00098       0.00023 black
18                 Discrimination    0.00088       0.00042 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency + 
    Religion
Model 2: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency + 
    Religion + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1952     2209.7                     
2      1947     2201.0  5   8.7418   0.1198
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      2307.098        2277.933
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.542307   0.115283  -4.704 2.55e-06 ***
Dental.Insurance1     -0.804673   0.054352 -14.805  < 2e-16 ***
Duration.of.Residency  0.045686   0.004692   9.738  < 2e-16 ***
Religion1              0.098471   0.158743   0.620    0.535    
Religion2              0.344886   0.178252   1.935    0.053 .  
Religion3              0.225075   0.166045   1.356    0.175    
Religion4             -0.193586   0.240057  -0.806    0.420    
Religion5             -0.152920   0.312546  -0.489    0.625    
Religion6             -0.423634   0.345307  -1.227    0.220    
Ethnicity1             0.272980   0.132916   2.054    0.040 *  
Ethnicity2            -0.225308   0.241135  -0.934    0.350    
Ethnicity3             0.197420   0.177079   1.115    0.265    
Ethnicity4            -0.141224   0.137876  -1.024    0.306    
Ethnicity5            -0.045418   0.202181  -0.225    0.822    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2659.6  on 1960  degrees of freedom
Residual deviance: 2201.0  on 1947  degrees of freedom
AIC: 2229

Number of Fisher Scoring iterations: 4
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity     prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.612 0.0388 Inf     0.534     0.685
 Asian Indian 0.489 0.0555 Inf     0.383     0.597
 Filipino     0.594 0.0520 Inf     0.489     0.690
 Korean       0.510 0.0431 Inf     0.426     0.594
 Other        0.534 0.0590 Inf     0.419     0.646
 Vietnamese   0.531 0.0415 Inf     0.450     0.611

Results are averaged over the levels of: Dental.Insurance, Religion 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T)
 contrast            odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio
 Chinese effect           1.314 0.175 Inf     0.925      1.87    1   2.054
 Asian Indian effect      0.798 0.192 Inf     0.423      1.51    1  -0.934
 Filipino effect          1.218 0.216 Inf     0.764      1.94    1   1.115
 Korean effect            0.868 0.120 Inf     0.604      1.25    1  -1.024
 Other effect             0.956 0.193 Inf     0.561      1.63    1  -0.225
 Vietnamese effect        0.943 0.130 Inf     0.656      1.36    1  -0.425
 p.value
  0.2400
  0.5252
  0.5252
  0.5252
  0.8223
  0.8048

Results are averaged over the levels of: Dental.Insurance, Religion 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
Intervals are back-transformed from the log odds ratio scale 
P value adjustment: fdr method for 6 tests 
Tests are performed on the log odds ratio scale 

Test Set Prediction with Ethnicity

Code
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Dentist.Check.up,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Dentist.Check.up, predictor = predictions)

Data: predictions in 162 controls (test_set$Dentist.Check.up 0) < 230 cases (test_set$Dentist.Check.up Yes).
Area under the curve: 0.7827
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.5187899
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Dentist.Check.up,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   106  56
       Yes  41 189
                                          
               Accuracy : 0.7526          
                 95% CI : (0.7067, 0.7945)
    No Information Rate : 0.625           
    P-Value [Acc > NIR] : 5.29e-08        
                                          
                  Kappa : 0.4827          
                                          
 Mcnemar's Test P-Value : 0.1552          
                                          
            Sensitivity : 0.7714          
            Specificity : 0.7211          
         Pos Pred Value : 0.8217          
         Neg Pred Value : 0.6543          
             Prevalence : 0.6250          
         Detection Rate : 0.4821          
   Detection Prevalence : 0.5867          
      Balanced Accuracy : 0.7463          
                                          
       'Positive' Class : Yes             
                                          

Test Set Prediction without Ethnicity

Code
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Dentist.Check.up,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Dentist.Check.up, predictor = predictions)

Data: predictions in 162 controls (test_set$Dentist.Check.up 0) < 230 cases (test_set$Dentist.Check.up Yes).
Area under the curve: 0.7767
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.5253235
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Dentist.Check.up,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   105  57
       Yes  40 190
                                          
               Accuracy : 0.7526          
                 95% CI : (0.7067, 0.7945)
    No Information Rate : 0.6301          
    P-Value [Acc > NIR] : 1.581e-07       
                                          
                  Kappa : 0.4817          
                                          
 Mcnemar's Test P-Value : 0.1043          
                                          
            Sensitivity : 0.7692          
            Specificity : 0.7241          
         Pos Pred Value : 0.8261          
         Neg Pred Value : 0.6481          
             Prevalence : 0.6301          
         Detection Rate : 0.4847          
   Detection Prevalence : 0.5867          
      Balanced Accuracy : 0.7467          
                                          
       'Positive' Class : Yes             
                                          

LOOCV approach

With ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Dentist.Check.up~., data = rfdata, method = "rf", trControl = ctrl)
summary(model)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted       1961   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes           3922   matrix     numeric  
oob.times       1961   -none-     numeric  
classes            2   -none-     character
importance        39   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y               1961   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            39   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
Code
model |> print()
Random Forest 

1961 samples
  18 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1765, 1765, 1765, 1765, 1765, 1764, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7241091  0.4182409
  20    0.7037165  0.3884665
  39    0.7027064  0.3843739

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

No ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Dentist.Check.up~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()
Random Forest 

1961 samples
  17 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1765, 1765, 1765, 1765, 1765, 1764, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa    
   2    0.7210246  0.4118714
  18    0.7006319  0.3817943
  34    0.6934813  0.3665140

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

Folk medicine

VSURF

Code
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Folkmedicine)
Characteristic N = 1,9471
Folkmedicine
    0 1,681 (86%)
    Yes 266 (14%)
1 n (%)
Code
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Folkmedicine,
                                   p=training_prop,
                                   list=F)

training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]

training_set |> gtsummary::tbl_summary(include=Folkmedicine)
Characteristic N = 1,5581
Folkmedicine
    0 1,345 (86%)
    Yes 213 (14%)
1 n (%)
Code
test_set |> gtsummary::tbl_summary(include=Folkmedicine)
Characteristic N = 3891
Folkmedicine
    0 336 (86%)
    Yes 53 (14%)
1 n (%)
Code
n_minority <- training_set |> filter(Folkmedicine=="Yes") |> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = training_set[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Folkmedicine~.,
      training_set,
      na.action="na.omit",
      parallel=T,
      verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Folkmedicine ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 8.1 secs 

 VSURF selected: 
    18 variables at thresholding step (in 3.9 secs)
    3 variables at interpretation step (in 2.7 secs)
    2 variables at prediction step (in 1.5 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Age"       "Ethnicity"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Age"                   "Duration.of.Residency" "Ethnicity"            
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1379974

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                             Age    0.01810       0.00092 black
2           Duration.of.Residency    0.01312       0.00086 black
3                       Ethnicity    0.00808       0.00050   red
4                English.Speaking    0.00702       0.00055 black
5                   Income_median    0.00463       0.00036 black
6                        Religion    0.00401       0.00081 black
7            English.Difficulties    0.00256       0.00058 black
8            Full.Time.Employment    0.00239       0.00046 black
9        Familiarity.with.America    0.00198       0.00043 black
10 Familiarity.with.Ethnic.Origin    0.00180       0.00040 black
11               Primary.Language    0.00134       0.00031 black
12                 Discrimination    0.00117       0.00038 black
13               Health.Insurance    0.00089       0.00023 black
14               Dental.Insurance    0.00076       0.00019 black
15                        US.Born    0.00069       0.00016 black
16                         Gender    0.00044       0.00036 black
17                      Belonging    0.00033       0.00033 black
18            Identify.Ethnically    0.00027       0.00035 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp])
mod_form <- reformulate(pred_vars, response="Folkmedicine")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Folkmedicine ~ Age + Duration.of.Residency
Model 2: Folkmedicine ~ Age + Duration.of.Residency + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1944     1507.7                          
2      1939     1450.9  5    56.75 5.693e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1511.497        1530.377
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -3.135528   0.204648 -15.322  < 2e-16 ***
Age                    0.022611   0.004636   4.877 1.08e-06 ***
Duration.of.Residency  0.007417   0.005960   1.244   0.2134    
Ethnicity1             0.522926   0.131708   3.970 7.18e-05 ***
Ethnicity2            -0.286686   0.173196  -1.655   0.0979 .  
Ethnicity3            -0.219541   0.217195  -1.011   0.3121    
Ethnicity4             0.757469   0.133299   5.682 1.33e-08 ***
Ethnicity5            -0.212500   0.287863  -0.738   0.4604    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1552.9  on 1946  degrees of freedom
Residual deviance: 1450.9  on 1939  degrees of freedom
AIC: 1466.9

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.1741 0.0176 Inf    0.1422    0.2113
 Asian Indian 0.0857 0.0142 Inf    0.0617    0.1180
 Filipino     0.0912 0.0203 Inf    0.0585    0.1394
 Korean       0.2104 0.0208 Inf    0.1725    0.2541
 Other        0.0917 0.0280 Inf    0.0496    0.1634
 Vietnamese   0.0665 0.0125 Inf    0.0459    0.0955

Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T)
 contrast            odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio
 Chinese effect           1.687 0.222 Inf     1.192     2.388    1   3.970
 Asian Indian effect      0.751 0.130 Inf     0.475     1.186    1  -1.655
 Filipino effect          0.803 0.174 Inf     0.453     1.424    1  -1.011
 Korean effect            2.133 0.284 Inf     1.500     3.032    1   5.682
 Other effect             0.809 0.233 Inf     0.378     1.728    1  -0.738
 Vietnamese effect        0.570 0.105 Inf     0.351     0.927    1  -3.053
 p.value
  0.0002
  0.1468
  0.3745
  <.0001
  0.4604
  0.0045

Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
Intervals are back-transformed from the log odds ratio scale 
P value adjustment: fdr method for 6 tests 
Tests are performed on the log odds ratio scale 

Test Set Prediction with Ethnicity

Code
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Folkmedicine,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Folkmedicine, predictor = predictions)

Data: predictions in 336 controls (test_set$Folkmedicine 0) < 53 cases (test_set$Folkmedicine Yes).
Area under the curve: 0.6865
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.1925455
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Folkmedicine,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   273  63
       Yes  26  27
                                         
               Accuracy : 0.7712         
                 95% CI : (0.7262, 0.812)
    No Information Rate : 0.7686         
    P-Value [Acc > NIR] : 0.4803199      
                                         
                  Kappa : 0.2488         
                                         
 Mcnemar's Test P-Value : 0.0001356      
                                         
            Sensitivity : 0.30000        
            Specificity : 0.91304        
         Pos Pred Value : 0.50943        
         Neg Pred Value : 0.81250        
             Prevalence : 0.23136        
         Detection Rate : 0.06941        
   Detection Prevalence : 0.13625        
      Balanced Accuracy : 0.60652        
                                         
       'Positive' Class : Yes            
                                         

Test Set Prediction without Ethnicity

Code
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Folkmedicine,predictions)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
Code
ROC

Call:
roc.default(response = test_set$Folkmedicine, predictor = predictions)

Data: predictions in 336 controls (test_set$Folkmedicine 0) < 53 cases (test_set$Folkmedicine Yes).
Area under the curve: 0.5989
Code
ROC |> plot()

Code
threshold <- ROC  |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold
[1] 0.1389611
Code
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Folkmedicine,responsepredictions,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   224 112
       Yes  24  29
                                          
               Accuracy : 0.6504          
                 95% CI : (0.6007, 0.6978)
    No Information Rate : 0.6375          
    P-Value [Acc > NIR] : 0.3189          
                                          
                  Kappa : 0.1258          
                                          
 Mcnemar's Test P-Value : 8.64e-14        
                                          
            Sensitivity : 0.20567         
            Specificity : 0.90323         
         Pos Pred Value : 0.54717         
         Neg Pred Value : 0.66667         
             Prevalence : 0.36247         
         Detection Rate : 0.07455         
   Detection Prevalence : 0.13625         
      Balanced Accuracy : 0.55445         
                                          
       'Positive' Class : Yes             
                                          

LOOCV approach

With ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Folkmedicine~., data = rfdata, method = "rf", trControl = ctrl)
summary(model)
                Length Class      Mode     
call               4   -none-     call     
type               1   -none-     character
predicted       1947   factor     numeric  
err.rate        1500   -none-     numeric  
confusion          6   -none-     numeric  
votes           3894   matrix     numeric  
oob.times       1947   -none-     numeric  
classes            2   -none-     character
importance        39   -none-     numeric  
importanceSD       0   -none-     NULL     
localImportance    0   -none-     NULL     
proximity          0   -none-     NULL     
ntree              1   -none-     numeric  
mtry               1   -none-     numeric  
forest            14   -none-     list     
y               1947   factor     numeric  
test               0   -none-     NULL     
inbag              0   -none-     NULL     
xNames            39   -none-     character
problemType        1   -none-     character
tuneValue          1   data.frame list     
obsLevels          2   -none-     character
param              0   -none-     list     
Code
model |> print()
Random Forest 

1947 samples
  18 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1753, 1752, 1752, 1753, 1753, 1752, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa     
   2    0.8633836  0.00000000
  20    0.8592757  0.04589079
  39    0.8582474  0.05946169

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.

No ethnicity

Code
unregister_dopar()
ctrl <- trainControl(method="cv")

model <- train(Folkmedicine~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()
Random Forest 

1947 samples
  17 predictor
   2 classes: '0', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1753, 1753, 1752, 1752, 1753, 1752, ... 
Resampling results across tuning parameters:

  mtry  Accuracy   Kappa     
   2    0.8633836  0.00000000
  18    0.8633862  0.07901580
  34    0.8602987  0.05914409

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 18.